package com.webcat.utils;

import org.apache.tika.Tika;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;

/**
 * Helpers for extracting plain text from documents through Apache Tika.
 */
public class WordUtil {

    /** Literal HTML non-breaking-space entity stripped from extracted text. */
    private static final String NBSP_ENTITY = "&nbsp;";

    /** Literal HTML line-break tag stripped from extracted text. */
    private static final String BR_TAG = "<br>";

    /** Matches a whitespace-only line together with its line terminator. */
    private static final String BLANK_LINE_REGEX = "(?m)^\\s*$(\\n|\\r\\n)";

    /**
     * Extracts the text of a document with Tika and cleans it up.
     *
     * <p>Cleanup steps, in order: every literal {@code &nbsp;} is removed, every literal
     * {@code <br>} is removed, and each blank (whitespace-only) line, including its line
     * terminator, is replaced by three spaces.
     *
     * @param in stream holding the document to parse
     * @return the cleaned text content
     * @throws RuntimeException wrapping any exception raised while parsing
     */
    public static String parseWord(InputStream in) {
        try {
            Tika tika = new Tika();
            String content = tika.parseToString(in);
            // Literal replacement on purpose: the former patterns "[&nbsp;]+" and "[<br>]*" were
            // regex character classes, so they deleted every individual '&', 'n', 'b', 's', 'p',
            // ';', '<', 'r' and '>' character and corrupted ordinary text.
            content = content.replace(NBSP_ENTITY, "");
            content = content.replace(BR_TAG, "");
            // Collapse blank lines (each one becomes three spaces).
            return content.replaceAll(BLANK_LINE_REGEX, "   ");
        }
        catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Manual smoke test: parses a hard-coded local file and prints the extracted text.
     *
     * @param args unused
     * @throws Exception if the file cannot be opened or closed
     */
    public static void main(String... args) throws Exception {
        // try-with-resources guarantees the file handle is released on every path.
        try (FileInputStream inputStream = new FileInputStream(new File("D:\\0226陕西交控对外统一服务平台设计文件.pdf"))) {
            System.out.println(parseWord(inputStream));
        }
    }
}
